import os
import pandas as pd
import statsmodels.api as sm
import scipy.stats as st
from statsmodels.stats.stattools import durbin_watson
from statsmodels.stats.api import het_breuschpagan
from statsmodels.compat import lzip
(a) (10%) Show the results of regression analysis as follows
# Fit an OLS regression of furnace `grade` on the first 28 sensor features.
df = pd.read_csv(os.path.join('Assignment1_Data', "MDS_Assignment1_furnace.csv"))
pred = df.grade  # response variable (dependent)
D = df[df.columns[:28]]  # first 28 columns are the features f0..f27 — TODO confirm column order
# add_constant prepends an intercept column so the model estimates a constant term.
X = sm.add_constant(D)
est = sm.OLS(pred, X)
summary = est.fit().summary()
summary
| Dep. Variable: | grade | R-squared: | 0.495 |
|---|---|---|---|
| Model: | OLS | Adj. R-squared: | 0.472 |
| Method: | Least Squares | F-statistic: | 21.52 |
| Date: | Thu, 06 Oct 2022 | Prob (F-statistic): | 1.16e-70 |
| Time: | 11:48:50 | Log-Likelihood: | -381.52 |
| No. Observations: | 620 | AIC: | 819.0 |
| Df Residuals: | 592 | BIC: | 943.1 |
| Df Model: | 27 | ||
| Covariance Type: | nonrobust |
| coef | std err | t | P>|t| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| const | 2.0339 | 0.018 | 110.528 | 0.000 | 1.998 | 2.070 |
| f0 | -0.0055 | 0.025 | -0.224 | 0.823 | -0.054 | 0.043 |
| f1 | 0.0440 | 0.028 | 1.571 | 0.117 | -0.011 | 0.099 |
| f2 | 0.3140 | 0.035 | 9.028 | 0.000 | 0.246 | 0.382 |
| f3 | 0.0186 | 0.042 | 0.447 | 0.655 | -0.063 | 0.100 |
| f4 | -0.0035 | 0.038 | -0.091 | 0.928 | -0.078 | 0.072 |
| f5 | -0.0740 | 0.030 | -2.483 | 0.013 | -0.133 | -0.015 |
| f6 | -0.0710 | 0.026 | -2.742 | 0.006 | -0.122 | -0.020 |
| f7 | 0.0235 | 0.028 | 0.853 | 0.394 | -0.031 | 0.078 |
| f8 | 0.0410 | 0.019 | 2.170 | 0.030 | 0.004 | 0.078 |
| f9 | 2.675e-16 | 4.05e-17 | 6.600 | 0.000 | 1.88e-16 | 3.47e-16 |
| f10 | -0.0446 | 0.021 | -2.119 | 0.035 | -0.086 | -0.003 |
| f11 | -0.0292 | 0.020 | -1.438 | 0.151 | -0.069 | 0.011 |
| f12 | -0.0006 | 0.022 | -0.027 | 0.979 | -0.044 | 0.043 |
| f13 | 0.0336 | 0.024 | 1.412 | 0.159 | -0.013 | 0.080 |
| f14 | -0.1832 | 0.021 | -8.898 | 0.000 | -0.224 | -0.143 |
| f15 | -0.1061 | 0.019 | -5.565 | 0.000 | -0.144 | -0.069 |
| f16 | -0.0358 | 0.020 | -1.756 | 0.080 | -0.076 | 0.004 |
| f17 | 0.0633 | 0.019 | 3.409 | 0.001 | 0.027 | 0.100 |
| f18 | -0.1904 | 0.021 | -9.194 | 0.000 | -0.231 | -0.150 |
| f19 | 0.0278 | 0.026 | 1.051 | 0.294 | -0.024 | 0.080 |
| f20 | 0.0126 | 0.020 | 0.644 | 0.520 | -0.026 | 0.051 |
| f21 | -0.0357 | 0.028 | -1.263 | 0.207 | -0.091 | 0.020 |
| f22 | 0.0747 | 0.021 | 3.533 | 0.000 | 0.033 | 0.116 |
| f23 | -0.0088 | 0.020 | -0.442 | 0.659 | -0.048 | 0.030 |
| f24 | 0.0193 | 0.024 | 0.800 | 0.424 | -0.028 | 0.067 |
| f25 | -0.0679 | 0.020 | -3.406 | 0.001 | -0.107 | -0.029 |
| f26 | -0.0360 | 0.022 | -1.625 | 0.105 | -0.080 | 0.008 |
| f27 | -0.0062 | 0.019 | -0.324 | 0.746 | -0.044 | 0.031 |
| Omnibus: | 39.669 | Durbin-Watson: | 2.004 |
|---|---|---|---|
| Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 147.525 |
| Skew: | 0.090 | Prob(JB): | 9.23e-33 |
| Kurtosis: | 5.383 | Cond. No. | 1.38e+16 |
(b) (5%) The fitting of the linear regression is a good idea? If yes, why? If no, why? What’s the possible reason of poor fitting?
ans: In terms of the R-squared value (0.495), the linear regression does not fit the data well. A possible reason for the poor fit is that there are too many independent variables, some of which are highly correlated with each other (multicollinearity).
(c) (5%) Based on the results, rank the independent variables by p-value. Which ones are statistically significant variables with p-values < 0.01? (i.e. 重要變數挑選)
# Fit the model again to get the results object, which carries the p-values.
result = est.fit()
# Pair each feature name f0..f27 with its p-value; index 0 is the intercept, so skip it.
pvals = result.pvalues[1:]
rank_list = [[f'f{idx}', pv] for idx, pv in enumerate(pvals)]
# Sort in place, largest (least significant) p-value first.
rank_list.sort(key=lambda pair: pair[1], reverse=True)
rank_list
[['f12', 0.9785197859614896], ['f4', 0.927523354571141], ['f0', 0.8231798305691802], ['f27', 0.7464189632101811], ['f23', 0.6586476274404273], ['f3', 0.6547220406461669], ['f20', 0.5200766295845716], ['f24', 0.42379860715669404], ['f7', 0.39383906135123725], ['f19', 0.29381973022946517], ['f21', 0.2071130863070476], ['f13', 0.1585855950754609], ['f11', 0.1509174478955042], ['f1', 0.11674036608665479], ['f26', 0.10464646982813271], ['f16', 0.07959189207626395], ['f10', 0.03450385822319018], ['f8', 0.030366136511409656], ['f5', 0.013312066540726987], ['f6', 0.006295707579197186], ['f25', 0.0007041639436919736], ['f17', 0.000696777004896272], ['f22', 0.0004429124783520328], ['f15', 3.9713256771779584e-08], ['f9', 9.154186372719588e-11], ['f14', 6.896905393342516e-18], ['f2', 2.4303678500953402e-18], ['f18', 6.355894795964475e-19]]
From the result, we can infer that f18, f2, f14, f9, f15, f22, f17, f25, and f6 are statistically significant variables because their p-values are less than 0.01.
(d) (15%) Testify the underlying assumptions of regression (1) Normality, (2) Independence, and (3) Homogeneity of Variance with respect to residual.
# (1) Normality of residuals: Shapiro-Wilk test (H0: residuals are normally distributed).
print("Normality: ")
shapiro_test = st.shapiro(result.resid)
print(shapiro_test)
print()

# (2) Independence of residuals: Durbin-Watson statistic (values near 2 => no autocorrelation).
print("Independence: ")
print(durbin_watson(result.resid))
print()

# Labels for the four statistics returned by het_breuschpagan, in order.
header = [
    'Lagrange multiplier statistic',
    'p-value',
    'f-value',
    'f p-value',
]

# (3) Homogeneity of variance: Breusch-Pagan test (H0: residual variance is constant).
print("Homogeneity of variance: ")
test = het_breuschpagan(result.resid, result.model.exog)
table = lzip(header, test)
print(table)
Normality:
ShapiroResult(statistic=0.9345124363899231, pvalue=7.536043131996446e-16)
Independence:
2.004053480472211
Homogeneity of variance:
[('Lagrange multiplier statistic', 140.62034965182642), ('p-value', 5.871842930105079e-17), ('f-value', 6.431710999631292), ('f p-value', 5.311815675352488e-20)]
(1) (10%) How to handle the raw dataset via data preprocessing?
import numpy as np
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
from mlxtend.preprocessing import TransactionEncoder

# Load the raw receipts: one transaction per line, item names comma-separated.
receipts_path = os.path.join('Assignment1_Data', 'MDS_Assignment1_groceries.csv')
with open(receipts_path, 'r') as f:
    receipts = f.readlines()

# Split each line into its list of item names: a 2-D list of transactions.
receipts_lst = [receipt.rstrip('\n').split(',') for receipt in receipts]

# One-hot encode the transactions into a boolean item matrix, as apriori expects.
te = TransactionEncoder()
te_ary = te.fit(receipts_lst).transform(receipts_lst)
df = pd.DataFrame(te_ary, columns=te.columns_)
ans:
step 1: load the raw data from the csv file, one transaction per line.
step 2: separate the items on every line and put them into a list.
Thus, we obtain a two-dimensional list in which each sub-list represents a single transaction.
(2) (10%) What’s the top 5 association rules? Show the support, confidence, and lift to each specific rule, respectively?
# Mine the frequent itemsets, then derive association rules keeping those with lift above 1.
frequent_itemsets = apriori(df, min_support=0.001, use_colnames=True)
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
# Restrict to rules with confidence of at least 15% and show the strongest lifts first.
confident = rules['confidence'] >= 15e-2
rules[confident].sort_values(by=['lift'], ascending=False)
| antecedents | consequents | antecedent support | consequent support | support | confidence | lift | leverage | conviction | |
|---|---|---|---|---|---|---|---|---|---|
| 9816 | (liquor) | (bottled beer, red/blush wine) | 0.011083 | 0.004881 | 0.001932 | 0.174312 | 35.715787 | 1.877786e-03 | 1.205200 |
| 9813 | (bottled beer, red/blush wine) | (liquor) | 0.004881 | 0.011083 | 0.001932 | 0.395833 | 35.715787 | 1.877786e-03 | 1.636828 |
| 101206 | (oil, whole milk, tropical fruit) | (other vegetables, yogurt, root vegetables) | 0.002542 | 0.012913 | 0.001017 | 0.400000 | 30.976378 | 9.839526e-04 | 1.645145 |
| 101208 | (yogurt, oil, root vegetables) | (other vegetables, whole milk, tropical fruit) | 0.001932 | 0.017082 | 0.001017 | 0.526316 | 30.811404 | 9.837768e-04 | 2.075049 |
| 100941 | (other vegetables, domestic eggs, whole milk, ... | (butter, tropical fruit) | 0.003355 | 0.009964 | 0.001017 | 0.303030 | 30.411255 | 9.833426e-04 | 1.420486 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 32351 | (frozen vegetables, pork) | (soda) | 0.006406 | 0.174377 | 0.001118 | 0.174603 | 1.001296 | 1.447369e-06 | 1.000274 |
| 7538 | (newspapers, beef) | (soda) | 0.006406 | 0.174377 | 0.001118 | 0.174603 | 1.001296 | 1.447369e-06 | 1.000274 |
| 4780 | (pip fruit) | (rolls/buns) | 0.075648 | 0.183935 | 0.013930 | 0.184140 | 1.001114 | 1.549719e-05 | 1.000251 |
| 41491 | (specialty chocolate, soda) | (other vegetables) | 0.006304 | 0.193493 | 0.001220 | 0.193548 | 1.000288 | 3.515039e-07 | 1.000069 |
| 5010 | (specialty chocolate) | (rolls/buns) | 0.030402 | 0.183935 | 0.005592 | 0.183946 | 1.000063 | 3.515039e-07 | 1.000014 |
32534 rows × 9 columns
ans:
The top 5 association rules ranked by lift are shown in the first five rows of the table above.
(3) (5%) Please provide/guess the “story” to interpret one of top-5 rules you are interested in.
ans:
I think it is quite intuitive that people who buy chocolate also tend to buy rolls/buns. It is a common combination for either breakfast or afternoon tea in Western countries.
(4) (10%) Give a visualization graph of your association rules
from pycaret.utils import enable_colab
from pycaret.arules import *  # NOTE(review): star import presumably brings in plot_model — prefer an explicit import
# Switch pycaret's plotting output to work inside Google Colab notebooks.
enable_colab()
Colab mode enabled.
# 3-D scatter visualization of the mined association rules (support/confidence/lift axes — TODO confirm axis mapping).
plot_model(rules, plot='3d')